# Load required libraries
require(mlr3learners)
## Loading required package: mlr3learners
## Warning: package 'mlr3learners' was built under R version 4.4.1
## Loading required package: mlr3
## Warning: package 'mlr3' was built under R version 4.4.1
require(kknn)
## Loading required package: kknn
require(class)
## Loading required package: class
require(ggplot2)
## Loading required package: ggplot2
library(mlr3)
library(mlr3viz)
## Warning: package 'mlr3viz' was built under R version 4.4.1
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.1
library(sqldf)
## Loading required package: gsubfn
## Warning: package 'gsubfn' was built under R version 4.4.1
## Loading required package: proto
## Warning: package 'proto' was built under R version 4.4.1
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: <9A3F5E83-2A35-33C3-9C5A-5255B116A1BE> /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/modules/R_X11.so
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(codebookr)
library(codebook)
## Warning: package 'codebook' was built under R version 4.4.1
##
## Attaching package: 'codebook'
## The following object is masked from 'package:codebookr':
##
## codebook
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
library(labelled)
## Warning: package 'labelled' was built under R version 4.4.1
##
## Attaching package: 'labelled'
## The following object is masked from 'package:codebook':
##
## to_factor
library(readr)
library(tibble)
# Step 1: Read raw data files (coded values + labels)
# Every column is read as character: the first two rows of each export are
# header/metadata rows (dropped in Step 2) and later steps convert the cells
# with as.character() anyway. Declaring the type up front guarantees that
# instead of relying on type guessing, and silences the column-spec message.
# readr still repairs duplicated headers (e.g. `2.19` -> `2.19...48`).
# NOTE(review): data_values is read from "kibera_labels.data.csv" and
# data_labels from "kibera_values_data.csv" - the file names look swapped
# relative to the variable names; confirm which export holds the codes.
data_values <- read_csv(
  "~/Desktop/GRIT/Kibera/originalCsv/kibera_labels.data.csv",
  col_types = cols(.default = col_character())
)
data_labels <- read_csv(
  "~/Desktop/GRIT/Kibera/originalCsv/kibera_values_data.csv",
  col_types = cols(.default = col_character())
)
# The codebook pairs the two files cell by cell (same row, same column name),
# so both must have the same number of rows; otherwise data.frame() could
# silently recycle the shorter column. Both also need the two header rows.
stopifnot(
  nrow(data_values) == nrow(data_labels),
  nrow(data_values) >= 2
)
# Step 2: Extract variable labels and drop metadata/header rows
# Row 1 supplies the per-column variable label and is captured before removal
variable_labels <- as.character(unlist(data_values[1, ]))
data_values <- data_values[-c(1, 2), ]
data_labels <- data_labels[-c(1, 2), ]
# Step 3: Prefix every column name that begins with a digit with "x"
# (applied identically to both data frames so their names stay comparable)
digit_first_values <- grepl("^[0-9]", names(data_values))
names(data_values)[digit_first_values] <-
  paste0("x", names(data_values)[digit_first_values])

digit_first_labels <- grepl("^[0-9]", names(data_labels))
names(data_labels)[digit_first_labels] <-
  paste0("x", names(data_labels)[digit_first_labels])
# Columns present under the same name in both data frames
common_cols <- intersect(names(data_values), names(data_labels))
codebook_list <- list()
# Step 4: Build the value/label mapping per column
# (Value comes from data_values, Label from data_labels, paired row by row)
for (col in common_cols) {
  pairs <- data.frame(
    Value = as.character(data_values[[col]]),
    Label = as.character(data_labels[[col]]),
    stringsAsFactors = FALSE
  )
  # Keep each complete (Value, Label) combination once
  pairs <- distinct(filter(pairs, !is.na(Value), !is.na(Label)))
  # Skip columns with nothing to map, or where both files hold the same text
  if (nrow(pairs) == 0 || all(pairs$Value == pairs$Label)) {
    next
  }
  pairs$Variable <- col
  codebook_list[[col]] <- pairs
}
# Step 5: Combine into a long-format codebook (one row per Variable/Value pair)
# When no column produced a mapping, bind_rows() returns a frame without the
# Variable/Value/Label columns and select() would fail, so an empty codebook
# with the expected columns is built instead.
codebook_df <- if (length(codebook_list) > 0) {
  bind_rows(codebook_list) %>%
    select(Variable, Value, Label)
} else {
  data.frame(
    Variable = character(),
    Value = character(),
    Label = character(),
    stringsAsFactors = FALSE
  )
}
# Step 7: Build nested dictionary for value labels
# Produces one named character vector per variable, in the orientation that
# labelled::val_labels<- expects: the vector's VALUES are the codes stored in
# the data and its NAMES are the label text, e.g. c(Female = "1", Male = "2").
# setNames(Value, Label) gives exactly that; the reverse order would attach
# the labels to the label strings rather than to the codes in data_values.
label_dict <- codebook_df %>%
  group_by(Variable) %>%
  summarise(mapping = list(setNames(Value, Label))) %>%
  deframe()
# Step 8: Attach the value labels to every matching column of data_values
for (var in intersect(names(label_dict), names(data_values))) {
  # Work on a character copy of the column so the labels stick
  column <- as.character(data_values[[var]])
  val_labels(column) <- label_dict[[var]]
  data_values[[var]] <- column
}
# Step 6: Attach the variable labels (question text) to data_values
# Labels are matched to columns by position: the i-th entry of
# variable_labels belongs to the i-th column.
for (idx in seq_len(ncol(data_values))) {
  column <- data_values[[idx]]
  var_label(column) <- variable_labels[idx]
  data_values[[idx]] <- column
}
# Step 9: Write the long-format codebook to disk as CSV
codebook_csv_path <- "~/Desktop/GRIT/Kibera/codebook/auto_generated_codebook.csv"
write_csv(codebook_df, codebook_csv_path)
# Step 10: Optional – Generate a codebook report (interactive viewer or RMarkdown)
# Only include non-free-text variables in the summary: a character column is
# excluded when it has more than 100 distinct values or contains any NA.
# vapply() is used rather than sapply() so the mask is always a logical
# vector, even when data_values has no columns (sapply() would return a list
# there, which is not a valid subscript).
is_excluded <- vapply(
  data_values,
  function(x) is.character(x) && (length(unique(x)) > 100 || any(is.na(x))),
  logical(1)
)
too_unique_vars <- names(data_values)[is_excluded]
safe_vars <- setdiff(names(data_values), too_unique_vars)
codebook_output <- codebook(data_values[, safe_vars])
# Save the labelled data set (data_values with value and variable labels).
# Note this stores the data, not the codebook_output report object.
saveRDS(data_values, file = "~/Desktop/GRIT/Kibera/codebook/codebook_labelled_data.rds")